library(tidyverse)
library(janitor)
library(ggfortify)
library(GGally)
library(modelr)
# Load the raw house-sales table from the project's data folder.
prices <- read_csv(file = "data/kc_house_data.csv")
Rows: 21613 Columns: 21── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): id
dbl  (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, sqft_abo...
dttm  (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) : 
  invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) : 
  attempt to use zero-length variable name
# Show the tibble preview, then a column-by-column overview of types/values.
print(prices)
prices %>% glimpse()
Rows: 21,613
Columns: 21
$ id            <chr> "7129300520", "6414100192", "5631500400", "2487200875", "1954400510", "7237550310", "132140006…
$ date          <dttm> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-02-18, 2014-05-12, 2014-06-27, 2015-01-1…
$ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 323000, 662500, 46800…
$ bedrooms      <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3, 3, 3, 3, 3, 4, 3, 2…
$ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.00, 1.75, 2.00, 3.00…
$ sqft_living   <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 1430, 1370, 1810, 2950,…
$ sqft_lot      <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000, 19901, 9680, 4850, …
$ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5, 2.0, 2.0, 1.5, 1.0,…
$ waterfront    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ view          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ condition     <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4, 5, 3, 5, 3, 3, 3, 3…
$ grade         <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8, 6, 8, 8, 7, 8, 8, …
$ sqft_above    <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430, 1370, 1810, 1980, …
$ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0, 760, 720, 0, 0, 0,…
$ yr_built      <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 1927, 1977, 1900, 1979…
$ yr_renovated  <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ zipcode       <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, 98146, 98038, 98007, 98115, 98028, 980…
$ lat           <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47.5123, 47.3684, 47.6…
$ long          <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122.315, -122.337, -122…
$ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2390, 2210, 1330, 1780, 1370, 1360, 2140…
$ sqft_lot15    <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, 7570, 8925, 6000, 12697, 10208, 4850, …
# Summary of `prices` by column type. skim() is called with its namespace
# prefix because skimr is not among the library() calls at the top.
skimr::skim(prices)
── Data Summary ────────────────────────
                           Values
Name                       prices
Number of rows             21613 
Number of columns          21    
_______________________          
Column type frequency:           
  character                1     
  numeric                  19    
  POSIXct                  1     
________________________         
Group variables            None  

1. Clean Data

# Build the modelling table.
# NOTE: column order is significant; later chunks select columns by position.
prices_clean <- prices %>%
  # Remove columns that are not used as predictors.
  select(-c(zipcode, sqft_living15, sqft_lot15, id, date)) %>%
  mutate(
    waterfront = as.logical(waterfront),
    # Stored as numbers, but handled as categories from here on.
    across(c(view, condition), as.factor),
    # TRUE only for a strictly positive source value; anything else,
    # including a missing value, becomes FALSE.
    renovated = if_else(yr_renovated > 0, TRUE, FALSE, missing = FALSE),
    basement = if_else(sqft_basement > 0, TRUE, FALSE, missing = FALSE)
  ) %>%
  # The flags above replace the columns they were derived from.
  select(-c(yr_renovated, sqft_basement))

2. Alias

# Fit price on every remaining column and check for aliased
# (linearly dependent) terms.
lm(formula = price ~ ., data = prices_clean) %>%
  alias()
Model :
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
    waterfront + view + condition + grade + sqft_above + yr_built + 
    lat + long + renovated + basement

3. Model

Round 1

# Pairs plots of price against the candidate predictors, split into three
# grids so each one stays readable.
# The predicate is wrapped in where(): passing a bare is.numeric to select()
# is deprecated in tidyselect and raises a warning.

# price plus the first five numeric predictors
prices_clean %>%
  select(where(is.numeric)) %>%
  select(1:6) %>%
  ggpairs()


# price plus the remaining numeric predictors
prices_clean %>%
  select(where(is.numeric)) %>%
  select(1, 7:11) %>%
  ggpairs()


# price plus the non-numeric (logical / factor) predictors
prices_clean %>%
  select(price, !where(is.numeric)) %>%
  ggpairs()

Correlations with price:

  • sqft_living 0.702
  • grade 0.667
  • sqft_above 0.606
  • bathrooms 0.525
  • bedrooms 0.308
  • lat 0.307
  • floors 0.257
  • waterfront ?
# Round-1 candidates: one single-predictor model for each of the three
# variables at the top of the correlation list.
mod1a <- lm(formula = price ~ sqft_living, data = prices_clean)
mod1b <- lm(formula = price ~ grade, data = prices_clean)
mod1c <- lm(formula = price ~ sqft_above, data = prices_clean)

# Diagnostic plots, one set per candidate.
autoplot(mod1a)

autoplot(mod1b)

autoplot(mod1c)

# Coefficients and fit statistics for the sqft_living model.
summary(mod1a)

Call:
lm(formula = price ~ sqft_living, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1476062  -147486   -24043   106182  4362067 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -43580.743   4402.690  -9.899   <2e-16 ***
sqft_living    280.624      1.936 144.920   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 261500 on 21611 degrees of freedom
Multiple R-squared:  0.4929,    Adjusted R-squared:  0.4928 
F-statistic: 2.1e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for the grade-only model.
summary(mod1b)

Call:
lm(formula = price ~ grade, data = prices_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-816988 -151958  -36158   97842 6046097 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1056045      12256  -86.17   <2e-16 ***
grade         208458       1582  131.76   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 273400 on 21611 degrees of freedom
Multiple R-squared:  0.4455,    Adjusted R-squared:  0.4454 
F-statistic: 1.736e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for the sqft_above-only model.
summary(mod1c)

Call:
lm(formula = price ~ sqft_above, data = prices_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-913132 -165624  -41468  109327 5339232 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  59953.2     4729.8   12.68   <2e-16 ***
sqft_above     268.5        2.4  111.87   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 292200 on 21611 degrees of freedom
Multiple R-squared:  0.3667,    Adjusted R-squared:  0.3667 
F-statistic: 1.251e+04 on 1 and 21611 DF,  p-value: < 2.2e-16

The residuals vs fitted plot is best for c (sqft_above), but this model has a much poorer adjusted R^2 than the other models.
Sqft_living has the highest correlation and a good R^2.

Round 2

# Residuals of the round-1 model (price ~ sqft_living). The response and the
# predictor already in the model are dropped so the remaining columns can be
# screened against what is still unexplained.
prices_resid <- prices_clean %>%
  add_residuals(mod1a) %>%
  select(-price, -sqft_living)

# where() replaces the deprecated bare-predicate form select(is.numeric), and
# `resid` is picked by name rather than by a position that shifts whenever
# the set of dropped columns changes.

# first four numeric predictors against the residuals
prices_resid %>%
  select(where(is.numeric)) %>%
  select(1:4, resid) %>%
  ggpairs()


# remaining numeric predictors against the residuals
prices_resid %>%
  select(where(is.numeric)) %>%
  select(5:9, resid) %>%
  ggpairs()


# non-numeric (logical / factor) predictors against the residuals
prices_resid %>%
  select(resid, !where(is.numeric)) %>%
  ggpairs()

Correlations:

  • waterfront
  • renovated
  • basement
  • lat
  • yr_built
# Round-2 candidates: sqft_living plus one additional predictor each.
mod2a <- lm(formula = price ~ sqft_living + waterfront, data = prices_clean)
mod2b <- lm(formula = price ~ sqft_living + renovated, data = prices_clean)
mod2c <- lm(formula = price ~ sqft_living + basement, data = prices_clean)
mod2d <- lm(formula = price ~ sqft_living + lat, data = prices_clean)

# Diagnostic plots, one set per candidate.
autoplot(mod2a)

autoplot(mod2b)

autoplot(mod2c)

autoplot(mod2d)

# Base-graphics diagnostics for the waterfront model.
plot(mod2a)

# The ten highest-priced rows.
# Single-row look-ups kept for reference:
#   slice(3915)
#   slice(7253)
#   slice(9255)
slice_max(prices_clean, order_by = price, n = 10)
summary(mod2a)

Call:
lm(formula = price ~ sqft_living + waterfront, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1376782  -142867   -21360   107201  4449253 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -32957.851   4242.971  -7.768 8.35e-15 ***
sqft_living       272.507      1.873 145.499  < 2e-16 ***
waterfrontTRUE 829983.104  19882.279  41.745  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 251500 on 21610 degrees of freedom
Multiple R-squared:  0.5307,    Adjusted R-squared:  0.5307 
F-statistic: 1.222e+04 on 2 and 21610 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + renovated.
summary(mod2b)

Call:
lm(formula = price ~ sqft_living + renovated, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1447169  -146107   -23192   106324  4228135 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -46328.631   4372.000  -10.60   <2e-16 ***
sqft_living      278.693      1.925  144.80   <2e-16 ***
renovatedTRUE 159947.478   8783.478   18.21   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 259500 on 21610 degrees of freedom
Multiple R-squared:  0.5005,    Adjusted R-squared:  0.5005 
F-statistic: 1.083e+04 on 2 and 21610 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + basement.
summary(mod2c)

Call:
lm(formula = price ~ sqft_living + basement, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1457685  -146961   -22846   104553  4375783 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -48371.266   4440.011 -10.894  < 2e-16 ***
sqft_living     277.495      1.976 140.468  < 2e-16 ***
basementTRUE  28768.071   3715.285   7.743 1.01e-14 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 261100 on 21610 degrees of freedom
Multiple R-squared:  0.4943,    Adjusted R-squared:  0.4942 
F-statistic: 1.056e+04 on 2 and 21610 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + lat.
summary(mod2d)

Call:
lm(formula = price ~ sqft_living + lat, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1487994  -125643   -20309    84613  4368717 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.416e+07  5.653e+05  -60.44   <2e-16 ***
sqft_living  2.749e+02  1.794e+00  153.27   <2e-16 ***
lat          7.177e+05  1.189e+04   60.36   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 241900 on 21610 degrees of freedom
Multiple R-squared:  0.566, Adjusted R-squared:  0.566 
F-statistic: 1.409e+04 on 2 and 21610 DF,  p-value: < 2.2e-16
# Nested-model comparison: does adding waterfront to price ~ sqft_living
# reduce the residual sum of squares significantly?
anova(mod1a, mod2a)
Analysis of Variance Table

Model 1: price ~ sqft_living
Model 2: price ~ sqft_living + waterfront
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1  21611 1.4773e+15                                   
2  21610 1.3670e+15  1 1.1024e+14 1742.6 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Nested-model comparison: does adding lat to price ~ sqft_living reduce the
# residual sum of squares significantly?
anova(mod1a, mod2d)
Analysis of Variance Table

Model 1: price ~ sqft_living
Model 2: price ~ sqft_living + lat
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1  21611 1.4773e+15                                   
2  21610 1.2641e+15  1 2.1314e+14 3643.5 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Either a (waterfront) or d (lat) would be a good addition. lat seems to improve the model slightly more than waterfront, so it may be a good one to take forward. The residuals vs leverage plot for a is also a little weird.

Round 3

# Residuals of the round-2 model (price ~ sqft_living + lat). The response
# and the predictors already in the model are dropped before screening.
prices_resid <- prices_clean %>%
  add_residuals(mod2d) %>%
  select(-price, -sqft_living, -lat)

# where() replaces the deprecated bare-predicate form select(is.numeric), and
# `resid` is picked by name rather than by a position that shifts whenever
# the set of dropped columns changes.

# first four numeric predictors against the residuals
prices_resid %>%
  select(where(is.numeric)) %>%
  select(1:4, resid) %>%
  ggpairs()


# remaining numeric predictors against the residuals
prices_resid %>%
  select(where(is.numeric)) %>%
  select(5:8, resid) %>%
  ggpairs()


# non-numeric (logical / factor) predictors against the residuals
prices_resid %>%
  select(resid, !where(is.numeric)) %>%
  ggpairs()

Correlations:

  • waterfront
  • view?
  • yr_built
# Pull out any listing reporting more than 30 bedrooms.
filter(prices_clean, bedrooms > 30)

33 bedrooms with fewer than 2 bathrooms, on only 1 floor?? Surely not….

# Round-3 candidates: sqft_living + lat plus one additional predictor each.
mod3a <- lm(
  formula = price ~ sqft_living + lat + waterfront,
  data = prices_clean
)
mod3b <- lm(
  formula = price ~ sqft_living + lat + view,
  data = prices_clean
)
mod3c <- lm(
  formula = price ~ sqft_living + lat + yr_built,
  data = prices_clean
)

# Diagnostic plots, one set per candidate.
autoplot(mod3a)

autoplot(mod3b)

autoplot(mod3c)

# Coefficients and fit statistics for the waterfront candidate.
summary(mod3a)

Call:
lm(formula = price ~ sqft_living + lat + waterfront, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1386107  -119931   -17106    84803  4458444 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -3.465e+07  5.387e+05  -64.33   <2e-16 ***
sqft_living     2.665e+02  1.719e+00  155.07   <2e-16 ***
lat             7.282e+05  1.133e+04   64.27   <2e-16 ***
waterfrontTRUE  8.532e+05  1.822e+04   46.83   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 230500 on 21609 degrees of freedom
Multiple R-squared:  0.606, Adjusted R-squared:  0.6059 
F-statistic: 1.108e+04 on 3 and 21609 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + lat + view.
summary(mod3b)

Call:
lm(formula = price ~ sqft_living + lat + view, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1795220  -113556   -14664    81290  4412462 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.419e+07  5.309e+05  -64.40   <2e-16 ***
sqft_living  2.499e+02  1.758e+00  142.14   <2e-16 ***
lat          7.189e+05  1.117e+04   64.38   <2e-16 ***
view1        1.583e+05  1.261e+04   12.55   <2e-16 ***
view2        1.289e+05  7.585e+03   17.00   <2e-16 ***
view3        2.289e+05  1.034e+04   22.13   <2e-16 ***
view4        6.172e+05  1.304e+04   47.35   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 227100 on 21606 degrees of freedom
Multiple R-squared:  0.6175,    Adjusted R-squared:  0.6174 
F-statistic:  5814 on 6 and 21606 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + lat + yr_built.
summary(mod3c)

Call:
lm(formula = price ~ sqft_living + lat + yr_built, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1645242  -120261   -14570    85543  4077885 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.770e+07  5.929e+05  -46.72   <2e-16 ***
sqft_living  2.936e+02  1.861e+00  157.79   <2e-16 ***
lat          6.551e+05  1.182e+04   55.40   <2e-16 ***
yr_built    -1.787e+03  5.875e+01  -30.42   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 236900 on 21609 degrees of freedom
Multiple R-squared:  0.5838,    Adjusted R-squared:  0.5838 
F-statistic: 1.011e+04 on 3 and 21609 DF,  p-value: < 2.2e-16
# Nested-model comparison: price ~ sqft_living + lat, with vs without view.
anova(mod2d, mod3b)
Analysis of Variance Table

Model 1: price ~ sqft_living + lat
Model 2: price ~ sqft_living + lat + view
  Res.Df        RSS Df Sum of Sq      F    Pr(>F)    
1  21610 1.2641e+15                                  
2  21606 1.1141e+15  4   1.5e+14 727.23 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Nested-model comparison: price ~ sqft_living + lat, with vs without
# waterfront.
anova(mod2d, mod3a)
Analysis of Variance Table

Model 1: price ~ sqft_living + lat
Model 2: price ~ sqft_living + lat + waterfront
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1  21610 1.2641e+15                                   
2  21609 1.1477e+15  1 1.1646e+14 2192.7 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

View appears to be the best predictor to add next

Round 4

# Residuals of the round-3 model (price ~ sqft_living + lat + view). The
# response and the predictors already in the model are dropped.
prices_resid <- add_residuals(prices_clean, mod3b) %>%
  select(-c(price, sqft_living, lat, view))

# Numeric and non-numeric columns are plotted together this round, so the
# table is simply cut into two halves; each half carries `resid` along.
# (The select(is.numeric) pre-filter used in earlier rounds is disabled.)

# columns 1-6 plus the residuals (column 13)
prices_resid %>%
  select(bedrooms:condition, resid) %>%
  ggpairs()


# columns 7-13, the last of which is the residuals
prices_resid %>%
  select(grade:resid) %>%
  ggpairs()


# Disabled: separate grid for the non-numeric columns.
# prices_resid %>%
#   select(14, !is.numeric) %>%
#   ggpairs()

Correlations:

  • yr_built
  • grade
  • waterfront
  • long
# Round-4 candidates: sqft_living + lat + view plus one additional predictor
# each.
mod4a <- lm(
  formula = price ~ sqft_living + lat + view + yr_built,
  data = prices_clean
)
mod4b <- lm(
  formula = price ~ sqft_living + lat + view + grade,
  data = prices_clean
)
mod4c <- lm(
  formula = price ~ sqft_living + lat + view + waterfront,
  data = prices_clean
)

# Diagnostic plots, one set per candidate.
autoplot(mod4a)

autoplot(mod4b)

autoplot(mod4c)

# Coefficients and fit statistics for the yr_built candidate.
summary(mod4a)

Call:
lm(formula = price ~ sqft_living + lat + view + yr_built, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1913360  -109916   -10491    83933  4210635 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.918e+07  5.621e+05  -51.92   <2e-16 ***
sqft_living  2.663e+02  1.860e+00  143.22   <2e-16 ***
lat          6.702e+05  1.119e+04   59.88   <2e-16 ***
view1        1.380e+05  1.247e+04   11.07   <2e-16 ***
view2        1.090e+05  7.525e+03   14.49   <2e-16 ***
view3        2.058e+05  1.025e+04   20.08   <2e-16 ***
view4        5.881e+05  1.291e+04   45.55   <2e-16 ***
yr_built    -1.383e+03  5.637e+01  -24.53   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 224000 on 21605 degrees of freedom
Multiple R-squared:  0.6279,    Adjusted R-squared:  0.6278 
F-statistic:  5208 on 7 and 21605 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + lat + view + grade.
summary(mod4b)

Call:
lm(formula = price ~ sqft_living + lat + view + grade, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1430046  -112420   -17585    77081  4765315 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.224e+07  5.144e+05  -62.69   <2e-16 ***
sqft_living  1.736e+02  2.541e+00   68.35   <2e-16 ***
lat          6.684e+05  1.084e+04   61.65   <2e-16 ***
view1        1.589e+05  1.216e+04   13.06   <2e-16 ***
view2        1.190e+05  7.319e+03   16.26   <2e-16 ***
view3        2.131e+05  9.985e+03   21.34   <2e-16 ***
view4        6.008e+05  1.258e+04   47.76   <2e-16 ***
grade        7.958e+04  1.976e+03   40.27   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 219000 on 21605 degrees of freedom
Multiple R-squared:  0.6442,    Adjusted R-squared:  0.6441 
F-statistic:  5589 on 7 and 21605 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for sqft_living + lat + view + waterfront.
summary(mod4c)

Call:
lm(formula = price ~ sqft_living + lat + view + waterfront, data = prices_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1575666  -113307   -14271    81725  4427530 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -3.453e+07  5.240e+05  -65.89   <2e-16 ***
sqft_living     2.503e+02  1.734e+00  144.32   <2e-16 ***
lat             7.260e+05  1.102e+04   65.87   <2e-16 ***
view1           1.563e+05  1.244e+04   12.56   <2e-16 ***
view2           1.242e+05  7.486e+03   16.59   <2e-16 ***
view3           2.088e+05  1.024e+04   20.39   <2e-16 ***
view4           3.916e+05  1.586e+04   24.69   <2e-16 ***
waterfrontTRUE  5.314e+05  2.186e+04   24.30   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 224000 on 21605 degrees of freedom
Multiple R-squared:  0.6277,    Adjusted R-squared:  0.6276 
F-statistic:  5204 on 7 and 21605 DF,  p-value: < 2.2e-16
# Nested-model comparison: price ~ sqft_living + lat + view, with vs without
# grade.
anova(mod3b, mod4b)
Analysis of Variance Table

Model 1: price ~ sqft_living + lat + view
Model 2: price ~ sqft_living + lat + view + grade
  Res.Df        RSS Df  Sum of Sq    F    Pr(>F)    
1  21606 1.1141e+15                                 
2  21605 1.0363e+15  1 7.7805e+13 1622 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Final model: price ~ sqft_living + lat + view + grade

Review

Skewed data and transformations

  • Right skewed data can often be log transformed to create a more normal distribution. This can be better for the model. Data bounded by 0 tends to be right skewed, eg. price
# Natural-log versions of price and living area, so the log-log model below
# has its inputs defined alongside it.
prices_log <- prices_clean %>%
  mutate(
    ln_house_price = log(price),
    ln_sqft_living = log(sqft_living)
  )

# Log-log model vs the untransformed round-1 model.
mod1log <- lm(ln_house_price ~ ln_sqft_living, prices_log)
# Fitted on prices_clean, like every other model in this notebook, so that
# re-running this chunk leaves mod1a identical to the round-1 object.
mod1a <- lm(price ~ sqft_living, prices_clean)

summary(mod1log)

Call:
lm(formula = ln_house_price ~ ln_sqft_living, data = prices_log)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.10511 -0.29300  0.01262  0.25701  1.33011 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    6.729916   0.047062   143.0   <2e-16 ***
ln_sqft_living 0.836771   0.006223   134.5   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3886 on 21611 degrees of freedom
Multiple R-squared:  0.4555,    Adjusted R-squared:  0.4555 
F-statistic: 1.808e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
# Untransformed price ~ sqft_living model, shown next to the log-log fit.
summary(mod1a)

Call:
lm(formula = price ~ sqft_living, data = prices)

Residuals:
     Min       1Q   Median       3Q      Max 
-1476062  -147486   -24043   106182  4362067 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -43580.743   4402.690  -9.899   <2e-16 ***
sqft_living    280.624      1.936 144.920   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 261500 on 21611 degrees of freedom
Multiple R-squared:  0.4929,    Adjusted R-squared:  0.4928 
F-statistic: 2.1e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
  • Could calculate ratios and use these as variables

Factoring

  • Grade should be factored. We should consider whether there is a linear relationship between grade and price (ie. is grade 10 twice as good as grade 5?)

# Price against number of floors, with floors kept numeric.
# Disabled alternative: convert floors with as.factor() first and draw
# geom_boxplot() instead of geom_point().
ggplot(prices_clean, aes(x = floors, y = price)) +
  geom_point()

# Copy of the data in which grade is a factor rather than a number.
prices_factored <- mutate(prices_clean, grade = factor(grade))

# Same formula, two encodings of grade: factor vs numeric.
mod2fac <- lm(formula = price ~ grade, data = prices_factored)
mod2unfac <- lm(formula = price ~ grade, data = prices_clean)

summary(mod2fac)

Call:
lm(formula = price ~ grade, data = prices_factored)

Residuals:
     Min       1Q   Median       3Q      Max 
-1929615  -135853   -35090    89080  5565658 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   142000     254499   0.558 0.576878    
grade3         63667     293870   0.217 0.828484    
grade4         72381     258849   0.280 0.779767    
grade5        106524     255024   0.418 0.676169    
grade6        159920     254561   0.628 0.529868    
grade7        260590     254513   1.024 0.305904    
grade8        400853     254520   1.575 0.115285    
grade9        631513     254547   2.481 0.013112 *  
grade10       929771     254611   3.652 0.000261 ***
grade11      1354842     254817   5.317 1.07e-07 ***
grade12      2049222     255909   8.008 1.23e-15 ***
grade13      3567615     264106  13.508  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 254500 on 21601 degrees of freedom
Multiple R-squared:  0.5197,    Adjusted R-squared:  0.5195 
F-statistic:  2125 on 11 and 21601 DF,  p-value: < 2.2e-16
# Same formula with grade left numeric, for comparison with the factored fit.
summary(mod2unfac)

Call:
lm(formula = price ~ grade, data = prices_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-816988 -151958  -36158   97842 6046097 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1056045      12256  -86.17   <2e-16 ***
grade         208458       1582  131.76   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 273400 on 21611 degrees of freedom
Multiple R-squared:  0.4455,    Adjusted R-squared:  0.4454 
F-statistic: 1.736e+04 on 1 and 21611 DF,  p-value: < 2.2e-16

ggpairs

#ggpairs(prices_clean, progress = FALSE) # This removes the text without altering the code chunk settings
#ggsave("ggpairs_1.png", width = 15, height = 15) # You can set dimensions for images

Diagnostics

  • If not all of a categorical variable are significant:
    • you could group categories if this has a real world justification
    • ANOVA
# Nested models on the factored data: sqft_above alone, then sqft_above plus
# grade as a factor. mod1 is the baseline that mod2 is compared against in
# the ANOVA further down.
mod1 <- lm(price ~ sqft_above, prices_factored)
mod2 <- lm(price ~ sqft_above + grade, prices_factored)

summary(mod2)

Call:
lm(formula = price ~ sqft_above + grade, data = prices_factored)

Residuals:
     Min       1Q   Median       3Q      Max 
-1845587  -136387   -34657    89917  5291001 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.212e+05  2.517e+05   0.482   0.6301    
sqft_above  7.177e+01  3.244e+00  22.123  < 2e-16 ***
grade3      4.166e+04  2.906e+05   0.143   0.8860    
grade4      4.629e+04  2.560e+05   0.181   0.8565    
grade5      5.954e+04  2.522e+05   0.236   0.8134    
grade6      1.040e+05  2.517e+05   0.413   0.6794    
grade7      1.803e+05  2.517e+05   0.716   0.4738    
grade8      2.876e+05  2.517e+05   1.143   0.2532    
grade9      4.690e+05  2.518e+05   1.862   0.0626 .  
grade10     7.272e+05  2.519e+05   2.886   0.0039 ** 
grade11     1.099e+06  2.522e+05   4.357 1.32e-05 ***
grade12     1.736e+06  2.535e+05   6.851 7.55e-12 ***
grade13     3.153e+06  2.618e+05  12.043  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 251700 on 21600 degrees of freedom
Multiple R-squared:  0.5303,    Adjusted R-squared:  0.5301 
F-statistic:  2033 on 12 and 21600 DF,  p-value: < 2.2e-16
# F-test of the two nested models: tests all grade levels jointly rather
# than one coefficient at a time.
anova(mod1, mod2)
Analysis of Variance Table

Model 1: price ~ sqft_above
Model 2: price ~ sqft_above + grade
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1  21611 1.8447e+15                                   
2  21600 1.3681e+15 11 4.7663e+14 684.11 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

The whole of grade is significant and is an improvement, even though not every individual grade level is significant.

  • We see a cone in residuals vs fitted. A log transform may help. When modelling factored grades we get groups, we want to avoid this too. We may need to add more variables to account for the differences better.
  • Q-Q plot. There is asymmetry and a large variation from the diagonal line. This may indicate a log transformation is needed as well.
  • Scale-location = how wrong? We do not want this diagonal line
  • If the diagnostic plots fail - we may need to add more variables, not ditch the whole model! The question is - is this model ready?
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShqYW5pdG9yKQ0KbGlicmFyeShnZ2ZvcnRpZnkpDQpsaWJyYXJ5KEdHYWxseSkNCmxpYnJhcnkobW9kZWxyKQ0KYGBgDQoNCg0KYGBge3J9DQpwcmljZXMgPC0gcmVhZF9jc3YoImRhdGEva2NfaG91c2VfZGF0YS5jc3YiKQ0KcHJpY2VzDQpgYGANCmBgYHtyfQ0KZ2xpbXBzZShwcmljZXMpDQpgYGANCmBgYHtyfQ0Kc2tpbXI6OnNraW0ocHJpY2VzKQ0KYGBgDQojIDEuIENsZWFuIERhdGENCmBgYHtyfQ0KcHJpY2VzX2NsZWFuIDwtIHByaWNlcyAlPiUgDQogIHNlbGVjdCgtemlwY29kZSwgLXNxZnRfbGl2aW5nMTUsIC1zcWZ0X2xvdDE1LCAtaWQsIC1kYXRlKSAlPiUgDQogIG11dGF0ZSh3YXRlcmZyb250ID0gYXMubG9naWNhbCh3YXRlcmZyb250KSwNCiAgICAgICAgIHZpZXcgPSBhcy5mYWN0b3IodmlldyksDQogICAgICAgICBjb25kaXRpb24gPSBhcy5mYWN0b3IoY29uZGl0aW9uKSwNCiAgICAgICAgIHJlbm92YXRlZCA9IGNhc2Vfd2hlbigNCiAgICAgICAgICAgeXJfcmVub3ZhdGVkID4gMCB+IFRSVUUsDQogICAgICAgICAgIC5kZWZhdWx0ID0gRkFMU0UNCiAgICAgICAgICksDQogICAgICAgICBiYXNlbWVudCA9IGNhc2Vfd2hlbigNCiAgICAgICAgICAgc3FmdF9iYXNlbWVudCA+IDAgfiBUUlVFLA0KICAgICAgICAgICAuZGVmYXVsdCA9IEZBTFNFDQogICAgICAgICApKSAlPiUgDQogIHNlbGVjdCgteXJfcmVub3ZhdGVkLCAtc3FmdF9iYXNlbWVudCkNCmBgYA0KDQojIDIuIEFsaWFzDQpgYGB7cn0NCmFsaWFzKGxtKHByaWNlIH4gLiwgcHJpY2VzX2NsZWFuKSkNCmBgYA0KIyAzLiBNb2RlbA0KDQojIyBSb3VuZCAxDQpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KcHJpY2VzX2NsZWFuICU+JSANCiAgc2VsZWN0KGlzLm51bWVyaWMpICU+JSANCiAgc2VsZWN0KDE6NikgJT4lIA0KICBnZ3BhaXJzKCkNCg0KcHJpY2VzX2NsZWFuICU+JSANCiAgc2VsZWN0KGlzLm51bWVyaWMpICU+JSANCiAgc2VsZWN0KDEsIDc6MTEpICU+JSANCiAgZ2dwYWlycygpDQoNCnByaWNlc19jbGVhbiAlPiUgDQogIHNlbGVjdCgxLCAhaXMubnVtZXJpYykgJT4lIA0KICBnZ3BhaXJzKCkNCmBgYA0KQ29ycmVsYXRpb25zIHdpdGggcHJpY2U6DQoNCiogc3FmdF9saXZpbmcgICAwLjcwMg0KKiBncmFkZSAgICAgICAgIDAuNjY3DQoqIHNxZnRfYWJvdmUgICAgMC42MDYNCiogYmF0aHJvb21zICAgICAwLjUyNQ0KKiBiZWRyb29tcyAgICAgIDAuMzA4DQoqIGxhdCAgICAgICAgICAgMC4zMDcNCiogZmxvb3JzICAgICAgICAwLjI1Nw0KKiB3YXRlcmZyb250ICAgID8NCg0KYGBge3J9DQptb2QxYSA8LSBsbShwcmljZSB+IHNxZnRfbGl2aW5nLCBwcmljZXNfY2xlYW4pDQptb2QxYiA8LSBsbShwcmljZSB+IGdyYWRlLCBwcmljZXNfY2xlYW4pDQptb2QxYyA8LSBsbShwcmlj
ZSB+IHNxZnRfYWJvdmUsIHByaWNlc19jbGVhbikNCmF1dG9wbG90KG1vZDFhKQ0KYXV0b3Bsb3QobW9kMWIpDQphdXRvcGxvdChtb2QxYykNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkobW9kMWEpDQpzdW1tYXJ5KG1vZDFiKQ0Kc3VtbWFyeShtb2QxYykNCmBgYA0KDQpUaGUgcmVzaWR1YWxzIHZzIGZpdHRlZCBpcyBiZXN0IGZvciBjIChzcWZ0X2Fib3ZlKSwgYnV0IHRoaXMgaGFzIGEgbXVjaCBwb29yZXIgYWRqdXN0ZWQgUl4yIHRoYW4gdGhlIG90aGVyIG1vZGVscy4gIA0KU3FmdF9saXZpbmcgaGFzIHRoZSBoaWdoZXN0IGNvcnJlbGF0aW9uIGFuZCBhIGdvb2QgUl4yLg0KDQojIyBSb3VuZCAyDQoNCmBgYHtyfQ0KcHJpY2VzX3Jlc2lkIDwtIHByaWNlc19jbGVhbiAlPiUgDQogIGFkZF9yZXNpZHVhbHMobW9kMWEpICU+JSANCiAgc2VsZWN0KC1wcmljZSwgLXNxZnRfbGl2aW5nKQ0KYGBgDQoNCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpwcmljZXNfcmVzaWQgJT4lIA0KICBzZWxlY3QoaXMubnVtZXJpYykgJT4lIA0KICBzZWxlY3QoMTo0LCAxMCkgJT4lIA0KICBnZ3BhaXJzKCkNCg0KcHJpY2VzX3Jlc2lkICU+JSANCiAgc2VsZWN0KGlzLm51bWVyaWMpICU+JSANCiAgc2VsZWN0KDU6MTApICU+JSANCiAgZ2dwYWlycygpDQoNCnByaWNlc19yZXNpZCAlPiUgDQogIHNlbGVjdCgxNSwgIWlzLm51bWVyaWMpICU+JSANCiAgZ2dwYWlycygpDQpgYGANCkNvcnJlbGF0aW9uczoNCg0KKiB3YXRlcmZyb250DQoqIHJlbm92YXRlZA0KKiBiYXNlbWVudA0KKiBsYXQNCiogeWVhcl9idWlsdA0KDQpgYGB7cn0NCm1vZDJhIDwtIGxtKHByaWNlIH4gc3FmdF9saXZpbmcgKyB3YXRlcmZyb250LCBwcmljZXNfY2xlYW4pDQptb2QyYiA8LSBsbShwcmljZSB+IHNxZnRfbGl2aW5nICsgcmVub3ZhdGVkLCBwcmljZXNfY2xlYW4pDQptb2QyYyA8LSBsbShwcmljZSB+IHNxZnRfbGl2aW5nICsgYmFzZW1lbnQsIHByaWNlc19jbGVhbikNCm1vZDJkIDwtIGxtKHByaWNlIH4gc3FmdF9saXZpbmcgKyBsYXQsIHByaWNlc19jbGVhbikNCmF1dG9wbG90KG1vZDJhKQ0KYXV0b3Bsb3QobW9kMmIpDQphdXRvcGxvdChtb2QyYykNCmF1dG9wbG90KG1vZDJkKQ0KYGBgDQpgYGB7cn0NCnBsb3QobW9kMmEpDQpgYGANCmBgYHtyfQ0KcHJpY2VzX2NsZWFuICU+JSANCiAgI3NsaWNlKDM5MTUpDQogICNzbGljZSg3MjUzKQ0KICAjc2xpY2UoOTI1NSkNCiAgc2xpY2VfbWF4KHByaWNlLCBuID0gMTApDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KG1vZDJhKQ0Kc3VtbWFyeShtb2QyYikNCnN1bW1hcnkobW9kMmMpDQpzdW1tYXJ5KG1vZDJkKQ0KYGBgDQpgYGB7cn0NCmFub3ZhKG1vZDFhLCBtb2QyYSkNCmFub3ZhKG1vZDFhLCBtb2QyZCkNCmBgYA0KRWl0aGVyIGEod2F0ZXJmcm9udCkgb3IgZChsYXQpIHdvdWxkIGJlIGEgZ29vZCBhZGRpdGlvbi4gbGF0IHNlZW1zIHRvIGltcHJvdmUgdGhlIG1vZGVsIHNsaWdodGx5IG1vcmUgdGhhbiB3YXRl
cmZyb250IHNvIG1heSBiZSBhIGdvb2Qgb25lIHRvIHRha2UgZm9yd2FyZC4gVGhlIHJlc2lkdWFscyB2cyBsZXZlcmFnZSBmb3IgYSBpcyBhbG9zIGEgbGl0dGxlIHdlaXJkLg0KDQojIyBSb3VuZCAzDQoNCmBgYHtyfQ0KcHJpY2VzX3Jlc2lkIDwtIHByaWNlc19jbGVhbiAlPiUgDQogIGFkZF9yZXNpZHVhbHMobW9kMmQpICU+JSANCiAgc2VsZWN0KC1wcmljZSwgLXNxZnRfbGl2aW5nLCAtbGF0KQ0KYGBgDQoNCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpwcmljZXNfcmVzaWQgJT4lIA0KICBzZWxlY3QoaXMubnVtZXJpYykgJT4lIA0KICBzZWxlY3QoMTo0LCA5KSAlPiUgDQogIGdncGFpcnMoKQ0KDQpwcmljZXNfcmVzaWQgJT4lIA0KICBzZWxlY3QoaXMubnVtZXJpYykgJT4lIA0KICBzZWxlY3QoNTo5KSAlPiUgDQogIGdncGFpcnMoKQ0KDQpwcmljZXNfcmVzaWQgJT4lIA0KICBzZWxlY3QoMTQsICFpcy5udW1lcmljKSAlPiUgDQogIGdncGFpcnMoKQ0KYGBgDQpDb3JyZWxhdGlvbnM6DQoNCiogd2F0ZXJmcm9udA0KKiB2aWV3Pw0KKiB5cl9idWlsdA0KDQpgYGB7cn0NCnByaWNlc19jbGVhbiAlPiUgDQogIGZpbHRlcihiZWRyb29tcyA+IDMwKQ0KYGBgDQozMyBiZWRyb29tcyB3aXRoIGxlc3MgdGhhbiAyIGJhdGhyb29tcywgb24gb25seSAxIGZsb29yPz8gU3VyZWx5IG5vdC4uLi4NCg0KYGBge3J9DQptb2QzYSA8LSBsbShwcmljZSB+IHNxZnRfbGl2aW5nICsgbGF0ICsgd2F0ZXJmcm9udCwgcHJpY2VzX2NsZWFuKQ0KbW9kM2IgPC0gbG0ocHJpY2UgfiBzcWZ0X2xpdmluZyArIGxhdCArIHZpZXcsIHByaWNlc19jbGVhbikNCm1vZDNjIDwtIGxtKHByaWNlIH4gc3FmdF9saXZpbmcgKyBsYXQgKyB5cl9idWlsdCwgcHJpY2VzX2NsZWFuKQ0KYGBgDQoNCmBgYHtyfQ0KYXV0b3Bsb3QobW9kM2EpDQphdXRvcGxvdChtb2QzYikNCmF1dG9wbG90KG1vZDNjKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtbWFyeShtb2QzYSkNCnN1bW1hcnkobW9kM2IpDQpzdW1tYXJ5KG1vZDNjKQ0KYGBgDQoNCmBgYHtyfQ0KYW5vdmEobW9kMmQsIG1vZDNiKQ0KYW5vdmEobW9kMmQsIG1vZDNhKQ0KYGBgDQoNClZpZXcgYXBwZWFycyB0byBiZSB0aGUgYmVzdCBwcmVkaWN0b3IgdG8gYWRkIG5leHQNCg0KIyMgUm91bmQgNA0KDQpgYGB7cn0NCnByaWNlc19yZXNpZCA8LSBwcmljZXNfY2xlYW4gJT4lIA0KICBhZGRfcmVzaWR1YWxzKG1vZDNiKSAlPiUgDQogIHNlbGVjdCgtcHJpY2UsIC1zcWZ0X2xpdmluZywgLWxhdCwgLXZpZXcpDQpgYGANCg0KYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0NCnByaWNlc19yZXNpZCAlPiUgDQogICNzZWxlY3QoaXMubnVtZXJpYykgJT4lIA0KICBzZWxlY3QoMTo2LCAxMykgJT4lIA0KICBnZ3BhaXJzKCkNCg0KcHJpY2VzX3Jlc2lkICU+JSANCiAgI3NlbGVjdChpcy5udW1lcmljKSAlPiUgDQogIHNlbGVjdCg3OjEzKSAlPiUgDQogIGdncGFpcnMoKQ0KDQojIHByaWNlc19yZXNpZCAl
PiUgDQojICAgc2VsZWN0KDE0LCAhaXMubnVtZXJpYykgJT4lIA0KIyAgIGdncGFpcnMoKQ0KYGBgDQpDb3JyZWxhdGlvbnM6DQoNCiogeXJfYnVpbHQNCiogZ3JhZGUNCiogd2F0ZXJmcm9udA0KKiBsb25nDQoNCmBgYHtyfQ0KbW9kNGEgPC0gbG0ocHJpY2UgfiBzcWZ0X2xpdmluZyArIGxhdCArIHZpZXcgKyB5cl9idWlsdCwgcHJpY2VzX2NsZWFuKQ0KbW9kNGIgPC0gbG0ocHJpY2UgfiBzcWZ0X2xpdmluZyArIGxhdCArIHZpZXcgKyBncmFkZSwgcHJpY2VzX2NsZWFuKQ0KbW9kNGMgPC0gbG0ocHJpY2UgfiBzcWZ0X2xpdmluZyArIGxhdCArIHZpZXcgKyB3YXRlcmZyb250LCBwcmljZXNfY2xlYW4pDQpgYGANCg0KYGBge3J9DQphdXRvcGxvdChtb2Q0YSkNCmF1dG9wbG90KG1vZDRiKQ0KYXV0b3Bsb3QobW9kNGMpDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KG1vZDRhKQ0Kc3VtbWFyeShtb2Q0YikNCnN1bW1hcnkobW9kNGMpDQpgYGANCg0KYGBge3J9DQphbm92YShtb2QzYiwgbW9kNGIpDQpgYGANCg0KRmluYWwgbW9kZWw6IHByaWNlIH4gc3FmdF9saXZpbmcgKyBsYXQgKyB2aWV3ICsgZ3JhZGUNCg0KDQojIFJldmlldw0KDQojIyBTa2V3ZWQgZGF0YSBhbmQgdHJhbnNmb3JtYXRpb25zDQoqIFJpZ2h0IHNrZXdlZCBkYXRhIGNhbiBvZnRlbiBiZSBsb2cgdHJhbnNmb3JtZWQgdG8gY3JlYXRlIGEgbW9yZSBub3JtYWwgZGlzdHJpYnV0aW9uLiBUaGlzIGNhbiBiZSBiZXR0ZXIgZm9yIHRoZSBtb2RlbC4NCkRhdGEgYm91bmRlZCBieSAwIHRlbmRzIHRvIGJlIHJpZ2h0IHNrZXdlZCwgZWcuIHByaWNlDQoNCmBgYHtyfQ0KcHJpY2VzX2NsZWFuICU+JSANCiAgZ2dwbG90KGFlcyhwcmljZSkpICsNCiAgZ2VvbV9oaXN0b2dyYW0oKSArDQogIHNjYWxlX3hfY29udGludW91cyh0cmFucyA9ICJsb2cxMCIpDQoNCnByaWNlc19sb2cgPC0gcHJpY2VzX2NsZWFuICU+JSANCiAgbXV0YXRlKGxuX2hvdXNlX3ByaWNlID0gbG9nKHByaWNlKSwNCiAgICAgICAgIGxuX3NxZnRfbGl2aW5nID0gbG9nKHNxZnRfbGl2aW5nKSkNCmBgYA0KDQpgYGB7cn0NCm1vZDFsb2cgPC0gbG0obG5faG91c2VfcHJpY2UgfiBzcWZ0X2xpdmluZywgcHJpY2VzX2xvZykNCm1vZDFhIDwtIGxtKHByaWNlIH4gc3FmdF9saXZpbmcsIHByaWNlcykNCg0Kc3VtbWFyeShtb2QxbG9nKQ0Kc3VtbWFyeShtb2QxYSkNCmBgYA0KKiBDb3VsZCBjYWxjdWxhdGUgcmF0aW9zIGFuZCB1c2UgdGhlc2UgYXMgdmFyaWFibGVzDQoNCiMjIEZhY3RvcmluZw0KKiBHcmFkZSBzaG91bGQgYmUgZmFjdG9yZWQuIFdlIHNob3VsZCBjb25zaWRlciB3aGV0aGVyIHRoZXJlIGlzIGEgbGluZWFyIHJlbGF0aW9uc2hpcCBiZXR3ZWVuIGdyYWRlIGFuZCBwcmljZSAoaWUuIGlzIGdyYWRlIDEwIHR3aWNlIGFzIGdvb2QgYXMgZ3JhZGUgNT8pDQoNCmBgYHtyfQ0KcHJpY2VzX2NsZWFuICU+JSANCiAgZ2dwbG90KGFlcyhncmFkZSwgcHJpY2UpKSArDQogIGdlb21fcG9pbnQoKQ0KYGBgDQpgYGB7cn0N
CnByaWNlc19jbGVhbiAlPiUNCiAgbXV0YXRlKGdyYWRlID0gYXMuZmFjdG9yKGdyYWRlKSkgJT4lIA0KICBnZ3Bsb3QoYWVzKGdyYWRlLCBwcmljZSkpICsNCiAgZ2VvbV9ib3hwbG90KCkNCmBgYA0KYGBge3J9DQpwcmljZXNfY2xlYW4gJT4lIA0KICAjbXV0YXRlKGZsb29ycyA9IGFzLmZhY3RvcihmbG9vcnMpKSAlPiUgDQogIGdncGxvdChhZXMoZmxvb3JzLCBwcmljZSkpICsNCiAgZ2VvbV9wb2ludCgpDQogICNnZW9tX2JveHBsb3QoKQ0KYGBgDQoNCmBgYHtyfQ0KcHJpY2VzX2ZhY3RvcmVkIDwtIHByaWNlc19jbGVhbiAlPiUgDQogIG11dGF0ZShncmFkZSA9IGFzLmZhY3RvcihncmFkZSkpDQoNCm1vZDJmYWMgPC0gbG0ocHJpY2UgfiBncmFkZSwgcHJpY2VzX2ZhY3RvcmVkKQ0KbW9kMnVuZmFjIDwtIGxtKHByaWNlIH4gZ3JhZGUsIHByaWNlc19jbGVhbikNCg0Kc3VtbWFyeShtb2QyZmFjKQ0Kc3VtbWFyeShtb2QydW5mYWMpDQpgYGANCiMjIGdncGFpcnMNCmBgYHtyfQ0KI2dncGFpcnMocHJpY2VzX2NsZWFuLCBwcm9ncmVzcyA9IEZBTFNFKSAjIFRoaXMgcmVtb3ZlcyB0aGUgdGV4dCB3aXRob3V0IGFsdGVyaW5nIHRoZSBjb2RlIGNodW5rIHNldHRpbmdzDQojZ2dzYXZlKCJnZ3BhaXJzXzEucG5nIiwgd2lkdGggPSAxNSwgaGVpZ2h0ID0gMTUpICMgWW91IGNhbiBzZXQgZGltZW5zaW9ucyBmb3IgaW1hZ2VzDQpgYGANCg0KIyMgRGlhZ25vc3RpY3MNCg0KKiBJZiBub3QgYWxsIG9mIGEgY2F0ZWdvcmljYWwgdmFyaWFibGUgYXJlIHNpZ25pZmljYW50Og0KICAtIHlvdSBjb3VsZCBncm91cCBjYXRlZ29yaWVzIGlmIHRoaXMgaGFzIGEgcmVhbCB3b3JsZCBqdXN0aWZpY2F0aW9uDQogIC0gQU5PVkENCg0KYGBge3J9DQptb2QxIDwtIGxtKHByaWNlIH4gc3FmdF9hYm92ZSwgcHJpY2VzX2ZhY3RvcmVkKQ0KbW9kMiA8LWxtKHByaWNlIH4gc3FmdF9hYm92ZSArIGdyYWRlLCBwcmljZXNfZmFjdG9yZWQpDQoNCnN1bW1hcnkobW9kMikNCmBgYA0KYGBge3J9DQphbm92YShtb2QxLCBtb2QyKQ0KYGBgDQpUaGUgd2hvbGUgb2YgZ3JhZGUgaXMgc2lnbmlmaWNhbnQgYW5kIGlzIGFuIGltcHJvdmVtZW50IGV2ZW4gdGhyb3VnaCBub3QgZXZlcnkgaW5kaXZpZHVhbCBncmFkZSBpcyBzaWduaWZpY2FudC4NCg0KKiBXZSBzZWUgYSBjb25lIGluIHJlc2lkdWFscyB2cyBmaXR0ZWQuIEEgbG9nIHRyYW5zZm9ybSBtYXkgaGVscC4gV2hlbiBtb2RlbGxpbmcgZmFjdG9yZWQgZ3JhZGVzIHdlIGdldCBncm91cHMsIHdlIHdhbnQgdG8gYXZvaWQgdGhpcyB0b28uIFdlIG1heSBuZWVkIHRvIGFkZCBtb3JlIHZhcmlhYmxlcyB0byBhY2NvdW50IGZvciB0aGUgZGlmZmVyZW5jZXMgYmV0dGVyLg0KKiBRLVEgcGxvdC4gVGhlcmUgaXMgYXN5bW1ldHJ5IGFuZCBhIGxhcmdlIHZhcmlhdGlvbiBmcm9tIHRoZSBkaWFnb25hbCBsaW5lLiBUaGlzIG1heSBpbmRpY2F0ZSBhIGxvZyB0cmFuc2Zvcm1hdGlvbiBpcyBuZWVkZWQgYXMgd2VsbC4NCiogU2NhbGUt
bG9jYXRpb24gPSBob3cgd3Jvbmc/IFdlIGRvIG5vdCB3YW50IHRoaXMgZGlhZ29uYWwgbGluZQ0KKiBJZiB0aGUgZGlhZ25vc3RpYyBwbG90cyBmYWlsIC0gd2UgbWF5IG5lZWQgdG8gYWRkIG1vcmUgdmFyaWFibGVzLCBub3QgZGl0Y2ggdGhlIHdob2xlIG1vZGVsISBUaGUgcXVlc3Rpb24gaXMgLSBpcyB0aGlzIG1vZGVsIHJlYWR5Pw0K